library(tidyverse)
# library(dict) # Still not found after installation
library(container) # For Dict class
library(useful) # For simple.impute
library(comprehenr) # For list comprehension
library(GGally)
library(reshape2)
library(gridExtra)
library(gplots)
library(DescTools) # For df summary
library(robustHD) # For df summary
library(caret)
library(effsize) # For Cohen's d
source('tools/wrangle.R')
source('tools/eda.R')
source('tools/engineer.R')
source('tools/split.R')
# SEED = 65466
Here’s the wrangled training set loaded from objects serialized at the end of wrangle_and_split.Rmd.
val_train_X = readRDS("data/val_train_X_wrangled.rds")
val_train_y = readRDS("data/val_train_y_wrangled.rds")
# Merge for easier analysis.
val_train_Xy = merge(
x = val_train_X,
y = val_train_y,
by = 'Id',
all = TRUE
)
str(val_train_Xy)
## 'data.frame': 715 obs. of 81 variables:
## $ Id : chr "1" "10" "1000" "1002" ...
## $ MSSubClass : Factor w/ 16 levels "20","30","40",..: 6 16 1 2 1 9 14 1 5 11 ...
## $ MSZoning : Factor w/ 8 levels "A","C","FV","I",..: 6 6 6 6 6 6 8 6 6 6 ...
## $ LotFrontage : num 65 50 64 60 75 65 21 NA 115 75 ...
## $ LotArea : num 8450 7420 6762 5400 11957 ...
## $ Street : Ord.factor w/ 3 levels "None"<"Grvl"<..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Alley : Ord.factor w/ 3 levels "None"<"Grvl"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ LotShape : Ord.factor w/ 4 levels "Reg"<"IR1"<"IR2"<..: 1 1 1 1 2 1 1 2 1 1 ...
## $ LandContour : Factor w/ 4 levels "Lvl","Bnk","HLS",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Utilities : Ord.factor w/ 5 levels "None"<"ELO"<"NoSeWa"<..: 5 5 5 5 5 5 5 5 5 5 ...
## $ LotConfig : Factor w/ 5 levels "Inside","Corner",..: 1 2 1 2 1 1 1 1 1 1 ...
## $ LandSlope : Ord.factor w/ 4 levels "None"<"Gtl"<"Mod"<..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 4 6 18 22 6 11 17 20 8 ...
## $ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 1 3 3 5 3 3 3 3 3 ...
## $ Condition2 : Factor w/ 9 levels "Artery","Feedr",..: 3 1 3 3 3 3 3 3 3 3 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2FmCon",..: 1 2 1 1 1 1 4 1 1 3 ...
## $ HouseStyle : Factor w/ 8 levels "1Story","1.5Fin",..: 4 3 1 1 1 8 4 1 2 1 ...
## $ OverallQual : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 7 5 7 5 8 5 4 6 5 5 ...
## $ OverallCond : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 6 5 6 5 8 4 7 5 5 ...
## $ YearBuilt : int 2003 1939 2006 1920 2006 1977 1970 1977 1948 1965 ...
## $ YearRemodAdd : int 2003 1950 2006 1950 2006 1977 1970 2001 1950 1965 ...
## $ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 4 ...
## $ RoofMatl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Exterior1st : Factor w/ 17 levels "AsbShng","AsphShn",..: 15 9 15 16 15 7 6 11 16 2 ...
## $ Exterior2nd : Factor w/ 18 levels "AsbShng","AsphShn",..: 15 9 15 16 15 7 6 11 16 2 ...
## $ MasVnrType : Factor w/ 5 levels "BrkCmn","BrkFace",..: 2 4 5 4 2 2 4 2 4 4 ...
## $ MasVnrArea : num 196 0 24 0 53 220 0 28 0 0 ...
## $ ExterQual : Ord.factor w/ 5 levels "Po"<"Fa"<"TA"<..: 4 3 4 3 4 4 3 3 3 3 ...
## $ ExterCond : Ord.factor w/ 5 levels "Po"<"Fa"<"TA"<..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 1 3 1 3 2 2 3 2 2 ...
## $ BsmtQual : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 5 4 5 3 5 5 4 4 4 1 ...
## $ BsmtCond : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 4 4 4 4 4 5 4 4 4 1 ...
## $ BsmtExposure : Ord.factor w/ 5 levels "None"<"No"<"Mn"<..: 2 2 4 2 2 4 2 3 2 1 ...
## $ BsmtFinType1 : Ord.factor w/ 7 levels "None"<"Unf"<"LwQ"<..: 7 7 7 2 7 7 5 6 2 1 ...
## $ BsmtFinSF1 : num 706 851 686 0 24 595 273 1200 0 0 ...
## $ BsmtFinType2 : Ord.factor w/ 7 levels "None"<"Unf"<"LwQ"<..: 2 2 2 2 2 2 3 2 2 1 ...
## $ BsmtFinSF2 : num 0 0 0 0 0 0 273 0 0 0 ...
## $ BsmtUnfSF : num 150 140 501 691 1550 390 0 410 720 0 ...
## $ TotalBsmtSF : num 856 991 1187 691 1574 ...
## $ Heating : Factor w/ 7 levels "None","Floor",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ HeatingQC : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 6 6 6 6 6 4 4 5 4 4 ...
## $ CentralAir : Ord.factor w/ 2 levels "N"<"Y": 2 2 2 2 2 2 2 2 2 1 ...
## $ Electrical : Factor w/ 6 levels "None","SBrkr",..: 2 2 2 3 2 2 2 2 2 2 ...
## $ X1stFlrSF : num 856 1077 1208 691 1574 ...
## $ X2ndFlrSF : num 854 0 0 0 0 0 546 0 551 0 ...
## $ LowQualFinSF : num 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num 1710 1077 1208 691 1574 ...
## $ BsmtFullBath : int 1 1 1 0 0 0 0 1 0 0 ...
## $ BsmtHalfBath : int 0 0 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 1 2 1 2 2 1 2 2 2 ...
## $ HalfBath : int 1 0 0 0 0 0 1 0 0 0 ...
## $ BedroomAbvGr : int 3 2 2 2 3 3 3 3 4 4 ...
## $ KitchenAbvGr : int 1 2 1 1 1 1 1 1 1 2 ...
## $ KitchenQual : Ord.factor w/ 5 levels "Po"<"Fa"<"TA"<..: 4 3 4 5 4 3 3 4 3 3 ...
## $ TotRmsAbvGrd : int 8 5 6 4 7 6 6 6 7 8 ...
## $ Functional : Ord.factor w/ 8 levels "Sal"<"Sev"<"Maj2"<..: 8 8 8 8 8 8 8 8 8 8 ...
## $ Fireplaces : int 0 2 0 0 1 0 0 2 1 0 ...
## $ FireplaceQu : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 1 4 1 1 5 1 1 4 5 1 ...
## $ GarageType : Factor w/ 7 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 2 7 ...
## $ GarageYrBlt : int 2003 1939 2006 1920 2006 1977 1970 1977 1948 NA ...
## $ GarageFinish : Ord.factor w/ 4 levels "None"<"Unf"<"RFn"<..: 3 3 3 2 3 2 3 3 2 1 ...
## $ GarageCars : int 2 1 2 1 3 1 1 2 1 0 ...
## $ GarageArea : num 548 205 632 216 824 328 286 480 312 0 ...
## $ GarageQual : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 4 5 4 3 4 4 4 4 4 1 ...
## $ GarageCond : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 4 4 4 4 4 4 4 4 4 1 ...
## $ PavedDrive : Ord.factor w/ 4 levels "None"<"N"<"P"<..: 4 4 4 2 4 4 4 4 4 4 ...
## $ WoodDeckSF : num 0 0 105 0 144 210 238 168 0 0 ...
## $ OpenPorchSF : num 61 4 61 20 104 0 0 68 0 0 ...
## $ EnclosedPorch: num 0 0 0 94 0 0 0 0 108 0 ...
## $ X3SsnPorch : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ScreenPorch : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : Ord.factor w/ 6 levels "None"<"Po"<"Fa"<..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Fence : Factor w/ 5 levels "GdPrv","MnPrv",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ MiscFeature : Factor w/ 6 levels "Elev","Gar2",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ MiscVal : num 0 0 0 0 0 0 0 0 0 0 ...
## $ MoSold : int 2 1 2 1 7 11 8 2 8 5 ...
## $ YrSold : int 2008 2008 2010 2007 2008 2008 2009 2010 2008 2010 ...
## $ SaleType : Factor w/ 10 levels "WD","CWD","VWD",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ SaleCondition: Factor w/ 6 levels "Normal","Abnorml",..: 1 1 1 2 1 1 1 1 1 1 ...
## $ SalePrice : num 208500 118000 206000 86000 232000 ...
There are 715 total observations in the validation training set, across 81 features (target feature “Saleprice” and id column “Id” included). 46 features are factors, 24 of which are ordered. 14 are integers. 20 are doubles, including “SalePrice”. (“Id” is integers cast as a character type.)
The full correlation grid is too large for most screens, but there are only a handful of noteworthy correlations which I’ll include with further analysis of each feature.
# ggcorr(
# select(val_train_Xy, where(is.numeric)),
# label = T,
# label_round = 2,
# label_size = 3
# )
I wrote a simple algorithm to try various transformations on each continuous feature and use the Shapiro-Wilk Normality Test to choose the best transformation. I then visualize each feature and decide if it even makes sense to attempt normalization.
I should have made logarithmic transformations be that of x+1, but instead I excluded 0s, which only partially handled the issue. 1s still convert to 0s in that case. The result was that variables with a substantial number of 1s did not find logs very useful for normalization.
I also did not include more dynamic transformations like Box-Cox. The script could be modified to include them.
For variables in which a 0 indicates a missing feature, I normalized only the non-zero set. The idea was that it might aid regression when the variable is put into interaction with its missingness.
funcs_lst = list(
'no_func' = function (x) { x },
'sqrt' = sqrt,
'cbrt' = function(x) { x^(1/3) },
'square' = function(x) { x^2 },
###
### FIXME
# Make log transformations of x+1.
###
'log' = log,
'log2' = log2,
'log10' = log10,
'1/x' = function (x) { 1/x },
'2^(1/x)' = function (x) { 2^(1/x) }
# Box Cox: write function that calls MASS::boxcox and include lambda in results/function returned.
# Yeo-Johnson
# Winsorize here?
# Rank
# Rank-Gauss
)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
print("Best normalizing transformations:")
## [1] "Best normalizing transformations:"
for (feat in names(best_normalizers)) {
func_name = best_normalizers[[feat]]$best_func$name
print(
paste(
feat, ":", func_name,
", p-value:", best_normalizers[[feat]]$results[[func_name]]$p.value
)
)
}
## [1] "LotFrontage : log10 , p-value: 0.896193826437694"
## [1] "LotArea : log10 , p-value: 2.26152919548748e-17"
## [1] "YearBuilt : square , p-value: 0.0108822707033521"
## [1] "YearRemodAdd : no_func , p-value: 0.0256596409501691"
## [1] "MasVnrArea : cbrt , p-value: 0.954476102599975"
## [1] "BsmtFinSF1 : cbrt , p-value: 4.83197463170721e-05"
## [1] "BsmtFinSF2 : cbrt , p-value: 0.576669464478684"
## [1] "BsmtUnfSF : cbrt , p-value: 0.00100793900881088"
## [1] "TotalBsmtSF : log , p-value: 2.08019611846809e-06"
## [1] "X1stFlrSF : log2 , p-value: 0.0309515511380178"
## [1] "X2ndFlrSF : sqrt , p-value: 0.580431655244041"
## [1] "LowQualFinSF : sqrt , p-value: 0.12996282240508"
## [1] "GrLivArea : log2 , p-value: 0.0129611332961232"
## [1] "BsmtFullBath : , p-value: "
## [1] "BsmtHalfBath : , p-value: "
## [1] "FullBath : , p-value: "
## [1] "HalfBath : , p-value: "
## [1] "BedroomAbvGr : sqrt , p-value: 0.995915265851324"
## [1] "KitchenAbvGr : , p-value: "
## [1] "TotRmsAbvGrd : no_func , p-value: 0.972016654577295"
## [1] "Fireplaces : , p-value: "
## [1] "GarageYrBlt : square , p-value: 0.00823549702716429"
## [1] "GarageCars : no_func , p-value: 0.971877057620897"
## [1] "GarageArea : sqrt , p-value: 0.203272774347792"
## [1] "WoodDeckSF : cbrt , p-value: 0.991997742575695"
## [1] "OpenPorchSF : cbrt , p-value: 0.898709370969093"
## [1] "EnclosedPorch : no_func , p-value: 0.526291715345031"
## [1] "X3SsnPorch : sqrt , p-value: 0.516253205278421"
## [1] "ScreenPorch : cbrt , p-value: 0.608040279289975"
## [1] "PoolArea : , p-value: "
## [1] "MiscVal : log2 , p-value: 0.280656484036341"
## [1] "MoSold : no_func , p-value: 0.875731443365881"
## [1] "YrSold : no_func , p-value: 0.96717393596804"
## [1] "SalePrice : log , p-value: 0.72923438230646"
x = 'SalePrice'
summary(val_train_Xy[[x]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 39300 129950 160000 181697 212500 745000
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 5000,
t_binw = 1/50
)
## NULL
val_train_Xy = val_train_Xy %>%
mutate('log(SalePrice)' = log(SalePrice))
x = 'log(SalePrice)'
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## log(SalePrice)
## Min. :10.58
## 1st Qu.:11.77
## Median :11.98
## Mean :12.03
## 3rd Qu.:12.27
## Max. :13.52
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/100,
t_binw = 1/100
)
## NULL
A natural log best normalizes the sale price distribution. However, because it isn’t a log10 transformation, it won’t precisely scale the prediction errors proportionally to the sale price. I could simply use the log10 instead, which does a fair job at normalizing as well, but I want to stick with the “best” transformation to make the best model. So, when I run ML, I will write a custom summary function to train with which simply divides the error by the price (the log(SalePrice)) before calculating the RMSE.
Here I’m looking for the best Winsorization quantile values of the best transformation – the best according to Shapiro-Wilk p-values. I could have programmatically explored the space and returned the best result. But, I want to visualize it and explore the process itself. In future projects, I might choose to further automate this.
It should be noted that Winsorization of the target variable should only be used for training, not for testing. A log transformation can be reversed as a vectorized operation, but Winsorization can’t. Winsorization should improve the accuracy of the model, but would be cheating on the test.
qqnorm(y = val_train_Xy$SalePrice, ylab = 'SalePrice')
qqline(y = val_train_Xy$SalePrice, ylab = 'SalePrice')
qqnorm(y = val_train_Xy$`log(SalePrice)`, ylab = 'log(SalePrice)')
qqline(y = val_train_Xy$`log(SalePrice)`, ylab = 'log(SalePrice)')
Win_log_x = Winsorize(
x = val_train_Xy[['log(SalePrice)']],
probs = c(0.005, 0.995)
)
qqnorm(y = Win_log_x, ylab = 'Win_log_x')
qqline(y = Win_log_x, ylab = 'Win_log_x')
Win_raw_x = Winsorize(
x = val_train_Xy[['SalePrice']],
probs = c(0, 0.95)
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = val_train_Xy$SalePrice))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$SalePrice
## W = 0.86089, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log(SalePrice)`))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$`log(SalePrice)`
## W = 0.9904, p-value = 0.0001299
print(shapiro.test(x = Win_log_x))
##
## Shapiro-Wilk normality test
##
## data: Win_log_x
## W = 0.99062, p-value = 0.0001609
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.93275, p-value < 2.2e-16
A small Winsorization of the log best normalizes the variable (W = 0.99062). It doesn’t pass the test for normality (p < 0.01), but it is still better prepared for a linear regression.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(SalePrice)' = Winsorize(
SalePrice,
probs = c(0, 0.95),
na.rm = T
)
) %>%
mutate(
'Win(log(SalePrice))' = Winsorize(
log(SalePrice),
probs = c(0.005, 0.995),
na.rm = T
)
)
Here are the correlations between SalePrice and the rest of the variables, compared to those of the transformed variables. Transforming SalePrice resulted in minor increases of correlations to many other continuous features and some minor reductions of correlations, an overall minor and insignificant improvement. But, this is the first of the variables to be transformed.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x = 'Win(log(SalePrice))'
x_lst = c('SalePrice', 'log(SalePrice)', 'Win(log(SalePrice))',
'Win(SalePrice)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## SalePrice log(SalePrice) Win(log(SalePrice)) Win(SalePrice)
## Min. :0.007746 Min. :0.001275 Min. :0.001406 Min. :0.006338
## 1st Qu.:0.124877 1st Qu.:0.130580 1st Qu.:0.131486 1st Qu.:0.116871
## Median :0.306100 Median :0.314440 Median :0.314346 Median :0.312263
## Mean :0.310147 Mean :0.322002 Mean :0.322153 Mean :0.317893
## 3rd Qu.:0.520650 3rd Qu.:0.538367 3rd Qu.:0.539042 3rd Qu.:0.515569
## Max. :0.689569 Max. :0.687804 Max. :0.686782 Max. :0.686731
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
I’ll need to hard code the top and bottom limits for the engineering script to apply to the test set without leakage. And, I’ll need to drop Win(SalePrice).
I want to keep the transformed and non-Winsorized version though. In the case of the target variable, I’ll train on the Winsorized variable and test on the transformed because I can reverse the transformation. In the case of predictor variables, the Winsorized version will be good for basic linear regression without interactions, but not necessarily for KNN and RF which may be able to use the outliers for clustering and grouping.
x = 'Win(log(SalePrice))'
min_val = min(val_train_Xy[[x]])
max_val = max(val_train_Xy[[x]])
print(paste("min_val:", min_val))
## [1] "min_val: 10.9579480025541"
print(paste("max_val:", max_val))
## [1] "max_val: 13.2279465702719"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(log(SalePrice))' = Winsorize(
.data[['log(SalePrice)']],
minval = min_val,
maxval = max_val
)
) %>%
select(-c('Win(SalePrice)'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1/50)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
To aid visualization, I’ll create a SalePrice factor with extremes and quartiles as levels.
val_train_Xy = val_train_Xy %>%
mutate(
'SalePrice.fact' = cut(
x = SalePrice,
breaks = quantile(x = SalePrice),
include.lowest = T,
ordered_result = T
)
)
summary(val_train_Xy$SalePrice.fact)
## [3.93e+04,1.3e+05] (1.3e+05,1.6e+05] (1.6e+05,2.12e+05] (2.12e+05,7.45e+05]
## 179 179 178 179
I’ll use log(SalePrice) to visualize against factors, rather than the Winsorized version.
x = 'MSSubClass'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "60" "20" "30" "80" "160" "50" "70" "120"
The most common class is 1-story newer than 1945 (267), followed by 2-story newer than 1945 (149) and 1.5-story finished all ages (67). The priciest class is 2-story newer than 1945, though some classes are so uncommon that it’s hard to say this completely confidently.
This feature is a mix of information mostly covered by HouseStyle, YearBuilt, and square footage. It might be worth dropping it to avoid overweighting this info, avoid spurious fit, and skip the compute cost of 16 one-hot features. Alternatively, decomposition with PCA might help pull out the unique information regarding PUD housing.
x = 'MSZoning'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 30)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "RL" "RM" "FV"
Mostly residential low density (574), some medium density (100), fewer floating village residential (flexible zoning, 31). Predictive power may be limited due to lack of diversity. That said, low-density residential and floating village clearly tend to sell for more than medium-density. Consider only one-hot encoding RL, RM, and FV?
x = 'LotFrontage'
summary(val_train_Xy[[x]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 21.00 58.00 70.00 70.04 80.00 313.00 133
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 5,
t_binw = 1/50
)
## NULL
A log10 scale centers it better (133 missing values excluded).
Note the extreme spike (~65 observations) in left of mode (70-75 SF) at 60 SF. It doesn’t seem to be associated with any particular neighborhood or lot configuration or anything, but probably just a common way to cut lots.
The feature could benefit from top/bottom coding.
133 NAs. Counterintuitively, a lower proportion of missing LotFrontages are inside lots (55/121 in the NA subset vs. 511/715 in the training set [these numbers are from a previous split and not accurate for the current data set]), whereas many lots that by definition have frontage (44 corner lots, FR2, and FR3) are missing frontage values. Could use LotArea, LotShape, LotConfig, and (?) (all of which aren’t missing values) to multivariate impute.
x = 'log10(LotFrontage)'
val_train_Xy = val_train_Xy %>%
mutate(
'log10(LotFrontage)' = ifelse(
LotFrontage == 0,
0,
log10(LotFrontage)
)
)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## log10(LotFrontage)
## Min. :1.322
## 1st Qu.:1.763
## Median :1.845
## Mean :1.820
## 3rd Qu.:1.903
## Max. :2.496
## NA's :133
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/50,
t_binw = 1/50
)
## NULL
qqnorm(y = val_train_Xy$LotFrontage, ylab = 'LotFrontage')
qqline(y = val_train_Xy$LotFrontage, ylab = 'LotFrontage')
qqnorm(y = val_train_Xy[[x]], ylab = x)
qqline(y = val_train_Xy[[x]], ylab = x)
Win_log10_x = Winsorize(
x = val_train_Xy[[x]],
probs = c(0.05, 0.99),
na.rm = T
)
qqnorm(y = Win_log10_x, ylab = 'Win_log10_x')
qqline(y = Win_log10_x, ylab = 'Win_log10_x')
Win_raw_x = Winsorize(
x = val_train_Xy$LotFrontage,
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win(LotFrontage)')
qqline(y = Win_raw_x, ylab = 'Win(LotFrontage)')
print(shapiro.test(x = val_train_Xy$LotFrontage))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$LotFrontage
## W = 0.87621, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy[[x]]))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy[[x]]
## W = 0.94238, p-value = 3.051e-14
print(shapiro.test(x = Win_log10_x))
##
## Shapiro-Wilk normality test
##
## data: Win_log10_x
## W = 0.97393, p-value = 1.16e-08
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.9771, p-value = 6.718e-08
It looks like just Winsorizing the raw variable may be the way to go here.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(LotFrontage)' = Winsorize(
LotFrontage,
probs = c(0.05, 0.95),
na.rm = T
)
) %>%
mutate(
'Win(log10(LotFrontage))' = Winsorize(
log10(LotFrontage),
probs = c(0.05, 0.99),
na.rm = T
)
)
x = 'Win(LotFrontage)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('LotFrontage', 'log10(LotFrontage)', 'Win(log10(LotFrontage))', 'Win(LotFrontage)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## LotFrontage log10(LotFrontage) Win(log10(LotFrontage))
## Min. :0.001316 Min. :0.002122 Min. :0.002136
## 1st Qu.:0.052789 1st Qu.:0.036725 1st Qu.:0.036057
## Median :0.152575 Median :0.123545 Median :0.121036
## Mean :0.189170 Mean :0.163383 Mean :0.162507
## 3rd Qu.:0.318210 3rd Qu.:0.285571 3rd Qu.:0.300117
## Max. :0.515867 Max. :0.466945 Max. :0.430664
## Win(LotFrontage)
## Min. :0.00368
## 1st Qu.:0.05062
## Median :0.13177
## Mean :0.17398
## 3rd Qu.:0.31994
## Max. :0.44127
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'Win(LotFrontage)'
min_val = min(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
max_val = max(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
print(paste("min_val:", min_val))
## [1] "min_val: 34.05"
print(paste("max_val:", max_val))
## [1] "max_val: 108"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(LotFrontage)' = Winsorize(
LotFrontage,
minval = min_val,
maxval = max_val
)
)
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
y_lst = c('MSSubClass', 'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood',
'BldgType', 'HouseStyle')
for (y in y_lst) {
plt = fenced_jbv(
data = val_train_Xy, x = y, y = 'log10(LotFrontage)') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plt)
}
Overall, the clusters of lots at 60’ and 80’ is quite apparent.
Unsurprisingly, low-density residential tends to have more lot frontage than medium-density residential. Slightly irregular lots might tend to have more frontage than regular, but we can’t say that with much confidence. Corner lots have more frontage than inside lots. There’s quite a bit of variation between neighborhoods.
Looking at MSSubClass, older homes tend to have less lot frontage than their equivalent house styles after 1945, unless they are PUD homes, which tend to have much less frontage than the rest of the classes. This connection is not visible in YearBuilt, which has no correlation to LotFrontage.
There’s also an interesting gap between 50’ and 60’ that only homes older than 1945 tend to fill, except for PUD homes.
fenced_jbv(
data = val_train_Xy,
x = 'MSSubClass',
y = 'log10(LotFrontage)',
jit_col = 'SalePrice.fact',
leg_lbl = 'SalePrice',
jit_alpha = 0.5,
box_color = 'red'
)
ggplot(val_train_Xy, aes(x = `log10(LotFrontage)`, y = `log(SalePrice)`)) +
geom_point(alpha = 0.5) +
geom_smooth(method = 'lm') +
facet_wrap(vars(MSSubClass), ncol = 5)
x = 'LotArea'
summary(val_train_Xy[[x]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1596 7610 9477 10346 11611 115149
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 200,
t_binw = 1/50
)
## NULL
x_trans = 'log10(LotArea)'
val_train_Xy = val_train_Xy %>%
mutate('log10(LotArea)' = log10(LotArea))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x_trans])
## log10(LotArea)
## Min. :3.203
## 1st Qu.:3.881
## Median :3.977
## Mean :3.960
## 3rd Qu.:4.065
## Max. :5.061
sum_and_trans_cont(
data = val_train_Xy,
x = x_trans,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/50,
t_binw = 1/500
)
## NULL
x_trans = 'log10(log10(LotArea))'
val_train_Xy = val_train_Xy %>%
mutate('log10(log10(LotArea))' = log10(log10(LotArea)))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x_trans])
## log10(log10(LotArea))
## Min. :0.5056
## 1st Qu.:0.5890
## Median :0.5995
## Mean :0.5970
## 3rd Qu.:0.6090
## Max. :0.7043
sum_and_trans_cont(
data = val_train_Xy,
x = x_trans,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/500,
t_binw = 1/750
)
## NULL
I doubt it’s worth doing the third log10 transformation now that the median and mean are so close. It still needs top- and bottom-coding anyway.
Even the second transformation might lead to overfit, but I’ll roll with it.
qqnorm(y = val_train_Xy$LotArea, ylab = 'LotArea')
qqline(y = val_train_Xy$LotArea, ylab = 'LotArea')
qqnorm(y = val_train_Xy$`log10(LotArea)`, ylab = 'log10(LotArea)')
qqline(y = val_train_Xy$`log10(LotArea)`, ylab = 'log10(LotArea)')
qqnorm(
y = val_train_Xy$`log10(log10(LotArea))`,
ylab = 'log10(log10(LotArea))'
)
qqline(
y = val_train_Xy$`log10(log10(LotArea))`,
ylab = 'log10(log10(LotArea))'
)
Win_log10log10_x = Winsorize(
x = val_train_Xy$`log10(log10(LotArea))`,
probs = c(0.05, 0.99),
na.rm = T
)
qqnorm(y = Win_log10log10_x, ylab = 'Win(log10(log10(LotArea)))')
qqline(y = Win_log10log10_x, ylab = 'Win(log10(log10(LotArea)))')
Win_log10_x = Winsorize(
x = val_train_Xy$`log10(LotArea)`,
probs = c(0.05, 0.99),
na.rm = T
)
qqnorm(y = Win_log10_x, ylab = 'Win(log10(LotArea))')
qqline(y = Win_log10_x, ylab = 'Win(log10(LotArea))')
Win_raw_x = Winsorize(
x = val_train_Xy$LotArea,
probs = c(0.01, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'LotArea')
qqline(y = Win_raw_x, ylab = 'LotArea')
print(shapiro.test(x = val_train_Xy$LotArea))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$LotArea
## W = 0.5211, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log10(LotArea)`))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$`log10(LotArea)`
## W = 0.9113, p-value < 2.2e-16
print(shapiro.test(x = val_train_Xy$`log10(log10(LotArea))`))
##
## Shapiro-Wilk normality test
##
## data: val_train_Xy$`log10(log10(LotArea))`
## W = 0.90238, p-value < 2.2e-16
print(shapiro.test(x = Win_log10log10_x))
##
## Shapiro-Wilk normality test
##
## data: Win_log10log10_x
## W = 0.94232, p-value = 4.825e-16
print(shapiro.test(x = Win_log10_x))
##
## Shapiro-Wilk normality test
##
## data: Win_log10_x
## W = 0.94434, p-value = 9.78e-16
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.97759, p-value = 5.239e-09
It looks like simply Winsorizing the base variable might be best.
val_train_Xy = val_train_Xy %>%
mutate(
'Win(LotArea)' = Winsorize(
LotArea,
probs = c(0.01, 0.95),
na.rm = T
)
) %>%
mutate(
'Win(log10(LotArea))' = Winsorize(
log10(LotArea),
probs = c(0.05, 0.99),
na.rm = T
)
)
Transforming LotArea with log10 resulted in bigger swings in r in both directions, but no real change in aggregate. The additional log10 transformation produced minor changes in correlation compared to the initial transformation, mostly toward less correlation.
Unsurprisingly, transformed LotArea is much more correlated to transformed LotFrontage than their untransformed counterparts (r went from 0.51 to 0.73). Also, transformed LotArea and transformed SalePrice correlate a little better than their untransformed counterparts, but are still weakly correlated (r increased from 0.30 to 0.37).
It also became noticeably more correlated to square footage of the first floor, bedrooms / total rooms above ground, and garage area/cars. Like previous transformations, some correlations lessened with this transformation, but none so much that the correlation dropped a bracket, e.g. from weak to insignificant.
The distribution of correlations dropped with the second transformation. I’ll only use the first transformation in the engineering script.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('LotArea', 'log10(LotArea)', 'log10(log10(LotArea))', 'Win(log10(LotArea))', 'Win(LotArea)')
x = 'Win(LotArea)'
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(df)
## LotArea log10(LotArea) log10(log10(LotArea))
## Min. :0.0008156 Min. :0.001463 Min. :0.001822
## 1st Qu.:0.0298980 1st Qu.:0.038420 1st Qu.:0.038591
## Median :0.1411549 Median :0.128146 Median :0.125858
## Mean :0.1692131 Mean :0.210784 Mean :0.208681
## 3rd Qu.:0.2935533 3rd Qu.:0.351718 3rd Qu.:0.343608
## Max. :0.5149010 Max. :0.728610 Max. :0.741522
## Win(log10(LotArea)) Win(LotArea)
## Min. :0.005534 Min. :0.0001219
## 1st Qu.:0.045043 1st Qu.:0.0704055
## Median :0.124050 Median :0.1270071
## Mean :0.216047 Mean :0.2244330
## 3rd Qu.:0.359245 3rd Qu.:0.3686740
## Max. :0.697739 Max. :0.6861456
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'Win(LotArea)'
min_val = min(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
max_val = max(val_train_Xy[!is.na(val_train_Xy[[x]]), x])
print(paste("min_val:", min_val))
## [1] "min_val: 1975.96"
print(paste("max_val:", max_val))
## [1] "max_val: 16946.4"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(LotArea)' = Winsorize(
LotArea,
minval = min_val,
maxval = max_val
)
) %>%
select(-c('log10(LotArea)', 'Win(log10(LotArea))'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 100)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
y_lst = c('log(SalePrice)', 'Win(LotFrontage)', 'X1stFlrSF',
'GrLivArea','TotRmsAbvGrd', 'GarageArea')
x = 'Win(LotArea)'
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
## NULL
y_lst = c('MSSubClass', 'MSZoning', 'LotShape', 'LotConfig', 'Neighborhood',
'BldgType', 'HouseStyle')
for (y in y_lst) {
plt = fenced_jbv(
data = val_train_Xy,
x = y,
y = 'log10(log10(LotArea))',
jit_h = 0 # Again R randomly decides to go wonk unless I enter the default.
) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
print(plt)
}
There are similar patterns as in LotFrontage, with a more marked upward trend against LotShape, and with less difference between lot configurations. There’s also an interesting pocket of two-story houses with low lot area; it’s not worth plotting, but you can see which neighborhoods these are.
Really lopsided to paved (3 gravel, 712 paved). Drop this feature.
summary(val_train_Xy$Street)
## None Grvl Pave
## 0 3 712
val_train_Xy = select(val_train_Xy, -c('Street'))
Vast majority have none, drop it.
summary(val_train_Xy$Alley)
## None Grvl Pave
## 669 24 22
val_train_Xy = select(val_train_Xy, -c('Alley'))
x = 'LotShape'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Reg" "IR1"
Irregularly shaped lots tend to sell for more. Good candidate for binarization if doing a basic linear regression with no interactions; one-hot encode ‘Reg’ and drop the rest of the levels.
summary(val_train_Xy$LandContour)
## Lvl Bnk HLS Low
## 642 33 24 16
val_train_Xy = select(val_train_Xy, -c('LandContour'))
All all-public. Definitely drop.
summary(val_train_Xy$Utilities)
## None ELO NoSeWa NoSewr AllPub
## 0 0 0 0 715
val_train_Xy = select(val_train_Xy, -c('Utilities'))
x = 'LotConfig'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Inside" "Corner" "CulDSac"
This set is mostly inside lots (513) but plenty of corner lots (124). I’ve always thought corner lots are prized, but it doesn’t seem to significantly add to price compared to an inside lot.
Cul de sacs are the priciest. This is somewhat a proxy for neighborhood (and maybe other features) as it is true across most neighborhoods except those that are already pricey (where cul de sacs are relatively more common) or least pricey (where cul de sacs don’t typically exist).
ggplot(
data = val_train_Xy,
# data = filter(
# val_train_Xy,
# LotConfig %in% c('Inside', 'Corner', 'CulDSac')
# ),
mapping = aes(
x = Neighborhood,
y = `log(SalePrice)`,
)
) +
geom_jitter(
alpha = .4,
mapping = aes(color = LotConfig, shape = LotConfig)
) +
geom_boxplot(notch = T, varwidth = T, alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(
data = val_train_Xy,
mapping = aes(
x = Neighborhood,
y = `log(SalePrice)`,
)
) +
geom_jitter(
alpha = .3,
mapping = aes(color = LotShape, shape = LotShape)
) +
geom_boxplot(notch = T, varwidth = T, alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(
data = val_train_Xy,
mapping = aes(
x = LotConfig,
y = `log(SalePrice)`,
)
) +
geom_jitter(
alpha = .4,
mapping = aes(color = LotShape, shape = LotShape)
) +
geom_boxplot(notch = T, varwidth = T, alpha = 0)
ggplot(
data = val_train_Xy,
mapping = aes(
x = LotShape,
y = `log(SalePrice)`,
)
) +
geom_jitter(
alpha = .4,
mapping = aes(color = LotConfig, shape = LotConfig)
) +
geom_boxplot(notch = T, varwidth = T, alpha = 0)
There appears to be some overlap with lot shape and lot configuration.
Vast majority are gentle drop it.
summary(val_train_Xy$LandSlope)
## None Gtl Mod Sev
## 0 677 32 6
val_train_Xy = select(val_train_Xy, -c('LandSlope'))
x = 'Neighborhood'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
keysize = 2
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "CollgCr" "BrkSide" "OldTown" "Somerst" "NWAmes" "Sawyer" "Edwards"
## [8] "Crawfor" "NridgHt" "Gilbert" "Names"
North Ames (Names) and CollgCr have the most residential homes (107 and 76). but they’re also zoned low-density. A handful of neighborhoods have almost no houses in this set.
The priciest neighborhoods are StoneBr, NridgeHt, and NoRidge. The least pricey (OldTown, MeadowV, and IDOTRR) are also the most dense and commercial. There are significant differences in prices between many neighborhoods, and not just between the cheapest and priciest.
ggplot(
data = val_train_Xy,
aes(x = Neighborhood, fill = MSZoning)
) +
geom_bar() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The vast majority are normal. Drop Condition2. But, for Condition1, there appear to be significant differences in SalePrice between Norm and Feedr, and maybe between Norm and Artery but there are too few Artery observations. It might be worth one-hot-encoding those categories.
x = 'Condition1'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Norm" "Feedr"
You might consider binarizing, lumping ‘Feedr’ and ‘Artery’ and maybe ‘RRAe’ together and lumping the rest with ‘Norm’. I’ll try it out during feature selection with caret in the ML phase.
summary(val_train_Xy$Condition2)
## Artery Feedr Norm RRNn RRAn PosN PosA RRNe RRAe
## 1 2 709 1 0 0 1 0 1
val_train_Xy = select(val_train_Xy, -c('Condition2'))
x = 'BldgType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)
Majority single-family (603), 24 NAs. It seems like an inherently important feature, despite the lopsidedness of the distribution. Probably safe to impute to mode (1Family), but other features might inform, such as MSSubClass, MSZoning, Neighborhood, HouseStyle, and building materials; multivariate impute might be in order, if keeping the feature in the first place.
Duplexes and two-family are significantly cheaper than single-family and townhouses, if accepting the low number in the sample. Candidate for binarization, but may lose interactions.
x = 'HouseStyle'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "2Story" "1Story" "SLvl" "1.5Fin"
Mostly one-story (358), but many two-story (220). Several significant price differences across groups. Maybe worth keeping, but also kind of noisy with the finished/unfinished business only applying to half-stories, and number of stories and finished status are encoded in other features.
x = 'OverallQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "7" "5" "8" "4" "6"
There’s a very strong relationship to SalePrice.
It’s a pretty normal distribution, slightly left-skewed. Mode (2199 5s) left of median/mean (180 6s), few 1s, 2s, and 3s. It might be worth casting as an integer and transforming to normalize and possibly improve the correlation.
x = 'OverallQual_int'
val_train_Xy = val_train_Xy %>%
mutate(OverallQual_int = as.integer(OverallQual))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## OverallQual_int
## Min. : 1.000
## 1st Qu.: 5.000
## Median : 6.000
## Mean : 6.092
## 3rd Qu.: 7.000
## Max. :10.000
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
## NULL
None of the transformations improved its distribution.
Here are the correlations between OverallQual_int and the rest of the variables.
This feature has several moderate correlations to features having to do with size and age. It also has a strong correlation to transformed SalePrice (0.82).
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## OverallQual_int
## Min. :0.01751
## 1st Qu.:0.13440
## Median :0.27000
## Mean :0.31424
## 3rd Qu.:0.54465
## Max. :0.82099
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF',
'GrLivArea','FullBath', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageCars',
'GarageArea')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
## NULL
Again, there’s a lot of interaction with MSSubClass and Neighborhood. Quality also decreases with zoning density. Two-story houses are generally rated higher than one-story houses. Vinyl gets rated highest among exteriors. Simply having masonry improves quality rating. Poured concrete foundations on average receive 7s, whereas cinder block and brick/tile receive 5s on average. The same is true for attached and built-in garages compared to detached and no garages. It almost looks like it’s better to have no fence than to have one with minor privacy. Court officer deeds/estates are rated more poorly than warranty deeds and new sales. Abnormal sales are rated lower than normal sales.
Overall quality is doing a lot of the work for other features toward predicting price.
y_lst = c('MSSubClass', 'MSZoning', 'Neighborhood', 'BldgType', 'HouseStyle',
'RoofMatl', 'RoofStyle', 'Exterior1st', 'MasVnrType', 'Foundation',
'Heating', 'Electrical', 'Functional', 'GarageType', 'GarageFinish',
'Fence', 'MiscFeature', 'SaleType', 'SaleCondition')
for (y in y_lst) {
plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
print(plt)
}
x = 'OverallCond'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "5" "6" "8" "4" "7"
OverallCond is like OverallQual, but with a much more pronounced mode (397 5s) left of median/mean (124 6s), probably due to wear and tear being more universal than quality construction.
The correlation to SalePrice is weaker, and 5s oddly seem to sell for more on average than higher-rated houses.
x = 'OverallCond_int'
val_train_Xy = val_train_Xy %>%
mutate(OverallCond_int = as.integer(OverallCond))
# Recalculate best normalizers. Might as well do them all, see if previous
# transformations benefit from further transformation, while we're at it.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## OverallCond_int
## Min. :1.00
## 1st Qu.:5.00
## Median :5.00
## Mean :5.55
## 3rd Qu.:6.00
## Max. :9.00
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
## NULL
This feature is only weakly correlated to a couple of age features. It has no linear correlation to SalePrice.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## OverallCond_int
## Min. :0.0008734
## 1st Qu.:0.0201988
## Median :0.0386786
## Mean :0.0676805
## 3rd Qu.:0.1009599
## Max. :0.3591485
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
## NULL
The bump in price among houses of average condition seems to have to do with a cluster of houses made in the late ’90s and early 2000s.
ggplot(
data = val_train_Xy,
mapping = aes(
x = OverallCond_int,
y = .data[['log(SalePrice)']],
color = YearBuilt
)
) +
geom_jitter(alpha = 0.5) +
geom_smooth() #+
# facet_wrap(vars(BldgType))
OldTown seems to be in relatively good shape, while Edwards has proportionately more houses in need of work and reconditioning. You can easily spot clustered 5s in the neighborhoods that likely grew up in the building boom of the 2000s.
y_lst = c('MSSubClass', 'MSZoning', 'Neighborhood', 'Condition1', 'BldgType',
'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'MasVnrType',
'BsmtExposure', 'Foundation', 'Heating', 'Electrical', 'Functional',
'GarageType', 'GarageFinish', 'Fence', 'MiscFeature', 'SaleType',
'SaleCondition')
for (y in y_lst) {
plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
print(plt)
}
Houses with metal and wood exteriors are typically in better condition than those with vinyl despite houses with vinyl siding typically being rated as higher quality and newer. Likewise, houses with cinder block foundations seem to fare better over time than those with poured concrete despite quality ratings, according to this data set, as do houses with detached garages compared to those with attached and built-in garages. Is this because older houses and lower-quality houses that are still standing are the ones that have received better maintenance?
p1 = ggplot(
data = val_train_Xy,
mapping = aes(
y = OverallCond_int,
x = Exterior1st,
color = YearBuilt
)
) +
geom_jitter() +
geom_boxplot(alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
p2 = ggplot(
data = val_train_Xy,
mapping = aes(
y = OverallCond_int,
x = Foundation,
color = YearBuilt
)
) +
geom_jitter() +
geom_boxplot(alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
p3 = ggplot(
data = val_train_Xy,
mapping = aes(
y = OverallCond_int,
x = GarageType,
color = YearBuilt
)
) +
geom_jitter() +
geom_boxplot(alpha = 0) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
grid.arrange(p1, p2, p3, ncol = 2)
No transformations normalized the distribution. You can see the construction boom between WWII and the ’80s (with a dip in mid ’70s stagflation), followed by the relative explosion starting in the late ’90s and dropping with the housing crisis in the 2000s.
Consider grouping around modes into a factor and dummy coding to accommodate some ML algorithms that wouldn’t handle a polymodal distribution well.
x = 'YearBuilt'
summary(val_train_Xy[x])
## YearBuilt
## Min. :1872
## 1st Qu.:1954
## Median :1972
## Mean :1971
## 3rd Qu.:2000
## Max. :2010
# sum_and_trans_cont(
# data = val_train_Xy,
# x = x,
# func = best_normalizers[[x]]$best_func$func,
# func_name = best_normalizers[[x]]$best_func$name,
# x_binw = 1,
# t_binw = 1
# )
ggplot(val_train_Xy, aes(x = YearBuilt)) +
geom_histogram(binwidth = 1)
x = 'YearBuilt'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c(x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## YearBuilt
## Min. :0.0009914
## 1st Qu.:0.0534053
## Median :0.1616352
## Mean :0.2334731
## 3rd Qu.:0.3684433
## Max. :0.8301510
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = colnames(select(val_train_Xy, where(is.numeric)))
for (y in y_lst) {
for (x in x_lst) {
plt = ggplot(
select(val_train_Xy, all_of(c(x, y))),
aes(x = .data[[x]], y = .data[[y]])
) +
geom_jitter() +
geom_smooth() +
labs(x = x, y = y) +
scale_x_continuous(breaks = seq(1880, 2010, 5)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(plt)
}
}
y_lst = colnames(select(val_train_Xy, where(is.factor)))
for (y in y_lst) {
plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plt)
}
Before looking at remodel year, this is the story I gather about houses purchased in this period based on the above visualizations of YearBuilt. I could try to weave the viz in with the narrative, but I’m not going to.
Houses and garages have gotten bigger over the years, with more bathrooms and less enclosed porch space. (I’m curious to see whether the increasing rarity of enclosed porch space led to it being a prized (priced) feature, or if it simply isn’t as valued now.) Newer homes tend to be valued more, and builders generally began to produce higher-quality houses.
Most of the houses built in the post-WWII boom were single-story, but that’s also when 2 1/2 stories became unheard-of. Split/multi-level and duplexes began to come on the scene. 1 1/2 stories had their heyday. Some of the houses from then and earlier make up the group of two-family conversions. It would be interesting to check remodel years to see when those conversions tended to take place.
As the century turned, prices grew exponentially. Two-story houses and PUD housing became more prevalent, and townhouses appeared for the first time. Zoning became less dense as new suburbs sprang up. Lots became more irregular. Cul de sacs and parks started to sprinkle in as feeder streets networked out. A handful of houses began to appear near railroads.
Even as masonry became a growing bling factor driving up overall quality, the age of plastic ushered in vinyl siding as the dominant exterior. The overall condition of houses from the turn of the century on is starkly average compared to older houses which are commonly in good shape. Checking against remodel years will likely explain some of this, but I’m curious to compare exterior conditions of different materials with regard to age.
The Lost Generation got brick and tile foundations. Boomers got cinder block and slabs. Gen x and Millenials got poured concrete. This enabled us to build into the sides of hills better and have more exposed basements with walkouts.
Most of the basementless houses were built during the post-WWII boom when ramblers were a common answer to the burgeoning dream of homeownership. Basement area continuously grew from then on, especially finished basement area as people began to live more underground. Kitchens and bedrooms began to be more common below ground.
Kitchens became more of a target for adding value with improved materials, construction, and appliances. Heating systems improved over the years as well. New electrical standards came into place in 1960.
When the ’80s hit, the automobiled society was in full swing, and it was rare to see less than a two-car garage built anymore, let alone a house without a garage. Rather than a detached secondary building, garages became part of the main house itself. It was more often a finished space for more than just housing and working on cars, but without the need for high-quality construction. Unpaved driveways were now out of the question, though.
After 2000, virtually no new houses had fences, at least none that were bought during the years this data set covers.
I’ll condense the housing booms around their modes in a factor that I can one-hot encode to accommodate some ML algorithms that wouldn’t handle a polymodal distribution well. To maintain the model going forward, new periods will have to be identified and added.
# Three periods: < 1945 < 1985ish < 2010
val_train_Xy = val_train_Xy %>%
mutate(
YearBuilt.fact = cut(
x = YearBuilt,
breaks = c(-Inf, 1944, 1986, Inf),
ordered_result = T,
labels = c('Before_1945', '1945_1986', 'After_1986')
)
)
p1 = ggplot(val_train_Xy, aes(x = YearBuilt)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = seq(1880, 2010, 5)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
p2 = ggplot(val_train_Xy, aes(x = YearBuilt.fact)) +
geom_bar()
grid.arrange(p1, p2)
x = 'YearBuilt.fact'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "After_1986" "Before_1945" "1945_1986"
It might make sense to transform year built into the age of the house when bought. The years of sales is a small range, but it may add a touch more predictive information, especially for the newer houses.
It seems to have added a couple of outliers, but applying a square-root transformation better centers it and removes the outliers.
0s don’t indicate a missing feature, so I want to include them in the search for the best normalizer.
val_train_Xy = val_train_Xy %>%
mutate('Age' = (YrSold - YearBuilt))
x = 'Age'
# Include 0s. Just recalculate rather than rewrite code.
num_feats = c(x)
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(-1)
)
summary(val_train_Xy[x])
## Age
## Min. : 0.00
## 1st Qu.: 8.00
## Median : 35.00
## Mean : 36.83
## 3rd Qu.: 55.00
## Max. :136.00
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1/10
)
## NULL
x = 'sqrt(Age)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(Age)' = sqrt(Age))
# Include 0s. Just recalculate rather than rewrite code.
num_feats = c(x)
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(-1)
)
summary(val_train_Xy[x])
## sqrt(Age)
## Min. : 0.000
## 1st Qu.: 2.828
## Median : 5.916
## Mean : 5.341
## 3rd Qu.: 7.416
## Max. :11.662
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/10,
t_binw = 1/10
)
## NULL
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('YearBuilt', 'Age', 'sqrt(Age)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## YearBuilt Age sqrt(Age)
## Min. :0.0009914 Min. :0.00325 Min. :0.000536
## 1st Qu.:0.0534053 1st Qu.:0.06282 1st Qu.:0.081377
## Median :0.1616352 Median :0.15871 Median :0.165492
## Mean :0.2334731 Mean :0.23488 Mean :0.248951
## 3rd Qu.:0.3684433 3rd Qu.:0.36838 3rd Qu.:0.359191
## Max. :0.8301510 Max. :0.82972 Max. :0.847412
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)', 'Win(log(SalePrice))', 'GarageArea', 'GarageCars',
'EnclosedPorch', 'OpenPorchSF', 'OverallQual_int', 'OverallCond_int')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
## NULL
There’s a small increase in linear correlation of Age to transformed SalePrice from YearBuilt (0.59 to 0.62). Correlations to some measures of size strengthened. The correlation to quality rose quite a bit (from 0.59 to 0.65) while the correlation to condition only rose slightly (0.36 to 0.37), remaining weak.
With YearBuilt, there’s still a polynomial appearance to the plot against log(SalePrice), but that’s smoothed out with sqrt(Age). YearRemodAdd may add clarity.
The concentration of average-condition houses among the youngest is still puzzling. It’s as if a few years of aging tends to improve the condition of a house. Perhaps owners tend to add cosmetic value in the first years. Maybe assessors use the youngest houses as the anchor by which to assess all others. Many of these houses were unfinished, which explains some but not all of this.
ggplot(
val_train_Xy,
aes(
x = .data[['sqrt(Age)']],
y = OverallCond_int,
shape = SaleCondition,
color = SaleCondition
)
) +
geom_jitter()
val_train_Xy = select(val_train_Xy, -c('Age'))
x = 'YearRemodAdd'
summary(val_train_Xy[x])
## YearRemodAdd
## Min. :1950
## 1st Qu.:1966
## Median :1994
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
## NULL
There was a curiously large number of remodels in 1950, about 80 remodels that year compared to the peak of about 50 in the 2000s. These were all built before 1950, and no house has an earlier remodel year than 1950, and some houses built before 1950 have remodel years later than 1950.
I’ll treat houses with a 1950 remodel as if they have not had a remodel; they may have had an earlier remodel, but I’m guessing that those older remodels are of little added value at this point. Ames assessors may have had reason to bottom-code, but I’ll set remodel year to built year in years prior to 1950 for now and see if it helps or hinders analysis and prediction.
val_train_Xy = val_train_Xy %>%
mutate(YearRemodAdd.uncode = ifelse(YearRemodAdd == 1950, YearBuilt, YearRemodAdd))
x = 'YearRemodAdd.uncode'
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## YearRemodAdd.uncode
## Min. :1880
## 1st Qu.:1966
## Median :1994
## Mean :1982
## 3rd Qu.:2004
## Max. :2010
# sum_and_trans_cont(
# data = val_train_Xy,
# x = x,
# func = best_normalizers[[x]]$best_func$func,
# func_name = best_normalizers[[x]]$best_func$name,
# x_binw = 1,
# t_binw = 1
# )
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = 1)
This only added outliers and slightly weakened the correlation to SalePrice.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('YearBuilt', 'YearRemodAdd', 'YearRemodAdd.uncode')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## YearBuilt YearRemodAdd YearRemodAdd.uncode
## Min. :0.0009914 Min. :0.004039 Min. :0.001117
## 1st Qu.:0.0534053 1st Qu.:0.055532 1st Qu.:0.063792
## Median :0.1616352 Median :0.152282 Median :0.145630
## Mean :0.2421409 Mean :0.203658 Mean :0.198553
## 3rd Qu.:0.3684433 3rd Qu.:0.294762 3rd Qu.:0.268835
## Max. :0.9624094 Max. :0.654782 Max. :0.657064
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
## NULL
y_lst = colnames(select(val_train_Xy, where(is.factor)))
for (y in y_lst) {
plt = fenced_jbv(data = val_train_Xy, x = y, y = x) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(plt)
}
val_train_Xy = select(val_train_Xy, -c('YearRemodAdd.uncode'))
Removing the records in which there is no remodel (i.e. YearRemodAdd == YearBuilt or 1950) adds clarity. 368 rows were removed for no remodel, and remodels really started being recorded in increasing numbers starting in the ’90s, maybe a little in the late ’80s.
ggplot(
val_train_Xy,
aes(x = ifelse(YearRemodAdd == YearBuilt, NA, YearRemodAdd)
)
) +
geom_histogram(binwidth = 1)
Because there are so many houses without remodels, I’ll split it into a factor, a level for each decade with the year 1950 conveniently lumped in with NAs.
x = 'YearRemodAdd.fact'
val_train_Xy = val_train_Xy %>%
mutate(
YearRemodAdd.fact = factor(
cut(
ifelse(
YearRemodAdd == YearBuilt | is.na(YearRemodAdd),
1949,
YearRemodAdd
),
breaks = c(1949, 1950, 1960, 1970, 1980, 1990, 2000, 2010),
labels = c('None', '50s', '60s', '70s', '80s', '90s', '00s')
)
)
) #%>%
# select(-c('YearRemodAdd'))
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "None" "00s" "90s"
Most are Gable (563), and many are Hip (139). Flat, Gambrel, and Mansard are neglible.
x = 'RoofStyle'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Gable" "Hip"
Vast majority are composition shingle (702). Can drop this feature.
summary(val_train_Xy$RoofMatl)
## ClyTile CompShg Membran Metal Roll Tar&Grv WdShake WdShngl
## 1 705 1 0 0 4 1 3
val_train_Xy = select(val_train_Xy, -c('RoofMatl'))
Most popular class is vinyl (255 and 249), but wood, metal, and others represent significant classes. No ‘None’ in 2nd, so all houses in Ames have at least two types of siding?
x = 'Exterior1st'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "VinylSd" "MetalSd" "Wd Sdng" "HdBoard" "Plywood"
x = 'Exterior2nd'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "VinylSd" "MetalSd" "Wd Sdng" "HdBoard" "Plywood"
Most have none (430), but plenty of brick (219) and stone (66). No cinderblock in the split.
x = 'MasVnrType'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "BrkFace" "None" "Stone"
x = 'MasVnrArea'
summary(val_train_Xy[x])
## MasVnrArea
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 0.0
## Mean : 103.3
## 3rd Qu.: 166.5
## Max. :1129.0
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$MasVnrArea != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 10,
t_binw = 0.25
)
## NULL
x = 'cbrt(MasVnrArea)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(MasVnrArea)' = MasVnrArea^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## cbrt(MasVnrArea)
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 2.391
## 3rd Qu.: 5.501
## Max. :10.413
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$MasVnrArea != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.25,
t_binw = 0.25
)
## NULL
Because this variable’s 0s indicate a missing feature, we’re really concerned with Winsorizing non-zero values.
df = val_train_Xy[val_train_Xy$MasVnrArea != 0, ]
qqnorm(y = df$MasVnrArea, ylab = 'MasVnrArea')
qqline(y = df$MasVnrArea, ylab = 'MasVnrArea')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
qqnorm(y = sqrt(sqrt(df$MasVnrArea)), ylab = 'sqrt(sqrt(MasVnrArea))')
qqline(y = sqrt(sqrt(df$MasVnrArea)), ylab = 'sqrt(sqrt(MasVnrArea))')
Win_sqrt_x = Winsorize(
x = sqrt(sqrt(df$MasVnrArea)),
probs = c(0.005, 0.995),
na.rm = T
)
qqnorm(y = Win_sqrt_x, ylab = 'Win(sqrt(sqrt(MasVnrArea)))')
qqline(y = Win_sqrt_x, ylab = 'Win(sqrt(sqrt(MasVnrArea)))')
Win_cbrt_x = Winsorize(
x = df$`cbrt(MasVnrArea)`,
probs = c(0.005, 0.995),
na.rm = T
)
qqnorm(y = Win_cbrt_x, ylab = 'Win(cbrt(MasVnrArea))')
qqline(y = Win_cbrt_x, ylab = 'Win(cbrt(MasVnrArea))')
Win_raw_x = Winsorize(
x = df$MasVnrArea,
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win(MasVnrArea)')
qqline(y = Win_raw_x, ylab = 'Win(MasVnrArea)')
print(shapiro.test(df$MasVnrArea))
##
## Shapiro-Wilk normality test
##
## data: df$MasVnrArea
## W = 0.86407, p-value = 3.522e-15
print(shapiro.test(df$'cbrt(MasVnrArea)'))
##
## Shapiro-Wilk normality test
##
## data: df$"cbrt(MasVnrArea)"
## W = 0.99498, p-value = 0.4769
print(shapiro.test(sqrt(sqrt(df$MasVnrArea))))
##
## Shapiro-Wilk normality test
##
## data: sqrt(sqrt(df$MasVnrArea))
## W = 0.99153, p-value = 0.09952
print(shapiro.test(Win_sqrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_sqrt_x
## W = 0.99525, p-value = 0.5281
print(shapiro.test(Win_cbrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_cbrt_x
## W = 0.99576, p-value = 0.6327
print(shapiro.test(Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.90384, p-value = 1.551e-12
min_val = min(Win_cbrt_x)
max_val = max(Win_cbrt_x)
val_train_Xy = val_train_Xy %>%
mutate(
'Win(cbrt(MasVnrArea))' = ifelse(
MasVnrArea == 0,
0,
Winsorize(
x = `cbrt(MasVnrArea)`,
# probs = c(0.005, 0.995),
minval = min_val,
maxval = max_val
)
)
)
x = 'Win(cbrt(MasVnrArea))'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('MasVnrArea', 'cbrt(MasVnrArea)', x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
## MasVnrArea cbrt(MasVnrArea) Win(cbrt(MasVnrArea))
## Min. :0.01480 Min. :0.00778 Min. :0.005013
## 1st Qu.:0.06893 1st Qu.:0.07272 1st Qu.:0.071175
## Median :0.10170 Median :0.13331 Median :0.134064
## Mean :0.14780 Mean :0.15174 Mean :0.151855
## 3rd Qu.:0.23594 3rd Qu.:0.23006 3rd Qu.:0.230854
## Max. :0.38455 Max. :0.36920 Max. :0.372665
## NA's :1 NA's :1 NA's :1
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## MasVnrArea cbrt(MasVnrArea) Win(cbrt(MasVnrArea))
## Min. :0.0009883 Min. :0.007318 Min. :0.007091
## 1st Qu.:0.0727252 1st Qu.:0.068257 1st Qu.:0.068430
## Median :0.1386111 Median :0.152985 Median :0.153200
## Mean :0.1806451 Mean :0.187874 Mean :0.187854
## 3rd Qu.:0.3070987 3rd Qu.:0.314667 3rd Qu.:0.314633
## Max. :0.4212846 Max. :0.430102 Max. :0.430096
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
This variable has a lot of zeros that indicate a missing feature that make it a candidate for binary encoding to aid linear regression. But, MasVnrType already encodes that in ‘None’. Because there’s a significant difference in price between MasVnrType levels, that should suffice. Also, leaving the 0s in improves the correlation to the target variable, so it may be moot.
# already hardcoded this one.
print(paste("min_val:", min_val))
## [1] "min_val: 1.52019153849196"
print(paste("max_val:", max_val))
## [1] "max_val: 10.278035603362"
ggplot(val_train_Xy, aes(x = .data[[x]])) +
geom_histogram(binwidth = .2)
y_lst = c('MasVnrType')
z = 'SalePrice.fact'
for (y in y_lst) {
plt = fenced_jbv(
data = val_train_Xy,
x = y,
y = 'cbrt(MasVnrArea)',
jit_col = z,
jit_alpha = 0.75,
leg_lb = z,
box_color = 'red'
) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
print(plt)
}
ggplot(val_train_Xy, aes(x = `cbrt(MasVnrArea)`, y = `log(SalePrice)`)) +
geom_point(alpha = 0.5) +
geom_smooth(method = 'lm') +
facet_wrap(vars(MasVnrType))
There are some houses with masonry footage but no type ascribed. I noticed this during the wrangling audit and decided to leave it for the modeling phase, maybe to impute the masonry type with caret using a select few variables as indicators or simply imputing the most common type (BrkFace), but maybe create a new level for those handful of undescribed masonry types.
Mostly average (440), many good (241).
x = 'ExterQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Gd" "TA"
Greater majority average (621), still some good (74). May be worth keeping.
x = 'ExterCond'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)
Evenly split between poured concrete and cinder block (317 and 302), but still 72 brick and tile.
x = 'Foundation'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "PConc" "BrkTil" "CBlock"
Could drop Stone and Wood and make it an ordered factor to represent as ints in regression or to one-hot.
Evenly split between average and good (313 and 303), but 63 excellent, and only 26 without basements. I’m having trouble imagining houses with basements and cinder block foundations.
x = 'BsmtQual'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "Gd" "TA" "Ex"
Vast majority average (635).
x = 'BsmtCond'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)
Great majority (about 464) not exposed, but still some average (100), good (71), and minimal exposure (54).
x = 'BsmtExposure'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "No" "Av" "Mn" "Gd"
211 are unfinished, 205 are top quality, and descending counts to low quality (34).
x = 'BsmtFinType1'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "GLQ" "Unf" "BLQ" "ALQ" "LwQ" "Rec"
Probably just one-hot GLQ, LwQ, and None for regression.
x = 'BsmtFinSF1'
summary(val_train_Xy[x])
## BsmtFinSF1
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 375.0
## Mean : 446.5
## 3rd Qu.: 699.0
## Max. :5644.0
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$BsmtFinSF1 != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 0.25
)
## NULL
x = 'cbrt(BsmtFinSF1)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(BsmtFinSF1)' = BsmtFinSF1^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## cbrt(BsmtFinSF1)
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 7.211
## Mean : 5.564
## 3rd Qu.: 8.875
## Max. :17.804
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy$BsmtFinSF1 != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.25,
t_binw = 0.25
)
## NULL
ggplot(val_train_Xy, aes(x = .data[['cbrt(BsmtFinSF1)']])) +
geom_histogram()
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BsmtFinSF1', 'cbrt(BsmtFinSF1)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## BsmtFinSF1 cbrt(BsmtFinSF1)
## Min. :0.006172 Min. :0.000255
## 1st Qu.:0.079152 1st Qu.:0.069259
## Median :0.218996 Median :0.115415
## Mean :0.220147 Mean :0.157082
## 3rd Qu.:0.318490 3rd Qu.:0.196450
## Max. :0.644301 Max. :0.643687
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
y_lst = c('log(SalePrice)')
plot_scat_pairs(df = val_train_Xy, x = x, y_lst = y_lst)
## NULL
plot_scat_pairs(df = val_train_Xy, x = 'BsmtFinSF1', y_lst = y_lst)
## NULL
This will be dropped in favor of a derivative variable, total basement sf, so no need to go any further.
val_train_Xy = select(val_train_Xy, -c('cbrt(BsmtFinSF1)'))
Mostly unfinished (609) but 27 no basements. So, only one house in Ames with a basement doesn’t have a second basement?? This doesn’t seem right. It might be worth dropping this feature.
x = 'BsmtFinType2'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 29)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## character(0)
No need to look further as this will be dropped in favor of total bsmt sf.
x = 'BsmtUnfSF'
summary(val_train_Xy[x])
## BsmtUnfSF
## Min. : 0.0
## 1st Qu.: 234.0
## Median : 470.0
## Mean : 564.6
## 3rd Qu.: 795.5
## Max. :2046.0
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 0.25
)
## NULL
x = 'cbrt(BsmtUnfSF)'
val_train_Xy = val_train_Xy %>%
mutate('cbrt(BsmtUnfSF)' = BsmtUnfSF^(1/3))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## cbrt(BsmtUnfSF)
## Min. : 0.000
## 1st Qu.: 6.162
## Median : 7.775
## Mean : 7.402
## 3rd Qu.: 9.266
## Max. :12.695
sum_and_trans_cont(
data = filter(val_train_Xy, BsmtUnfSF != 0),
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 0.25,
t_binw = 0.25
)
## NULL
Because this variable’s 0s indicate a missing basement, just Winsorizing non-zero values.
df = val_train_Xy[val_train_Xy$BsmtUnfSF != 0, ]
qqnorm(y = df$BsmtUnfSF, ylab = 'BsmtUnfSF')
qqline(y = df$BsmtUnfSF, ylab = 'BsmtUnfSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_cbrt_x = Winsorize(
x = df[[x]],
probs = c(0.05, 0.95),
na.rm = T
)
qqnorm(y = Win_cbrt_x, ylab = 'Win(cbrt(BsmtUnfSF)')
qqline(y = Win_cbrt_x, ylab = 'Win(cbrt(BsmtUnfSF)')
Win_raw_x = Winsorize(
x = df$BsmtUnfSF,
probs = c(0, 0.95),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'BsmtUnfSF')
qqline(y = Win_raw_x, ylab = 'BsmtUnfSF')
print(shapiro.test(x = df$BsmtUnfSF))
##
## Shapiro-Wilk normality test
##
## data: df$BsmtUnfSF
## W = 0.92795, p-value < 2.2e-16
print(shapiro.test(x = df$`cbrt(BsmtUnfSF)`))
##
## Shapiro-Wilk normality test
##
## data: df$`cbrt(BsmtUnfSF)`
## W = 0.993, p-value = 0.003542
print(shapiro.test(x = Win_cbrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_cbrt_x
## W = 0.96978, p-value = 2.054e-10
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.93316, p-value < 2.2e-16
x = 'cbrt(BsmtUnfSF)'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('BsmtUnfSF', 'cbrt(BsmtUnfSF)')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## BsmtUnfSF cbrt(BsmtUnfSF)
## Min. :0.007563 Min. :0.007861
## 1st Qu.:0.042330 1st Qu.:0.039367
## Median :0.137369 Median :0.128333
## Mean :0.135120 Mean :0.122909
## 3rd Qu.:0.180533 3rd Qu.:0.165133
## Max. :0.473352 Max. :0.356293
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
## BsmtUnfSF cbrt(BsmtUnfSF)
## Min. :0.008363 Min. :0.001348
## 1st Qu.:0.055197 1st Qu.:0.045389
## Median :0.101256 Median :0.078388
## Mean :0.127480 Mean :0.108878
## 3rd Qu.:0.156947 3rd Qu.:0.141344
## Max. :0.534817 Max. :0.539121
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'TotalBsmtSF'
summary(val_train_Xy[x])
## TotalBsmtSF
## Min. : 0.0
## 1st Qu.: 793.5
## Median : 981.0
## Mean :1058.1
## 3rd Qu.:1291.0
## Max. :6110.0
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 1/20
)
## NULL
x = 'log(TotalBsmtSF)'
val_train_Xy = val_train_Xy %>%
mutate(
'log(TotalBsmtSF)' = ifelse(
TotalBsmtSF <= 0,
0,
log(TotalBsmtSF)
)
)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## log(TotalBsmtSF)
## Min. :0.000
## 1st Qu.:6.676
## Median :6.889
## Mean :6.734
## 3rd Qu.:7.163
## Max. :8.718
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1/20,
t_binw = 1
)
## NULL
x = 'square(log(TotalBsmtSF))'
val_train_Xy = val_train_Xy %>%
mutate(
'square(log(TotalBsmtSF))' = ifelse(
TotalBsmtSF <= 0,
0,
log(TotalBsmtSF)^2
)
)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## square(log(TotalBsmtSF))
## Min. : 0.00
## 1st Qu.:44.58
## Median :47.45
## Mean :46.77
## 3rd Qu.:51.31
## Max. :76.00
sum_and_trans_cont(
data = val_train_Xy,
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
## NULL
Just checking non-zero set.
df = val_train_Xy[val_train_Xy$TotalBsmtSF != 0, ]
qqnorm(y = df$TotalBsmtSF, ylab = 'TotalBsmtSF')
qqline(y = df$TotalBsmtSF, ylab = 'TotalBsmtSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_log_x_squared = Winsorize(
x = df[[x]],
probs = c(0.005, 0.995),
na.rm = T
)
qqnorm(y = Win_log_x_squared, ylab = 'Win_log_x_squared')
qqline(y = Win_log_x_squared, ylab = 'Win_log_x_squared')
Win_raw_x = Winsorize(
x = df$TotalBsmtSF,
probs = c(0, 0.99),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win_raw_x')
qqline(y = Win_raw_x, ylab = 'Win_raw_x')
print(shapiro.test(x = df$TotalBsmtSF))
##
## Shapiro-Wilk normality test
##
## data: df$TotalBsmtSF
## W = 0.84661, p-value < 2.2e-16
print(shapiro.test(x= df$`square(log(TotalBsmtSF))`))
##
## Shapiro-Wilk normality test
##
## data: df$`square(log(TotalBsmtSF))`
## W = 0.98181, p-value = 1.344e-07
print(shapiro.test(x = Win_log_x_squared))
##
## Shapiro-Wilk normality test
##
## data: Win_log_x_squared
## W = 0.99018, p-value = 0.0001361
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.9591, p-value = 5.387e-13
x = 'Win(square(log(TotalBsmtSF)))'
val_train_Xy = val_train_Xy %>%
mutate(
'Win(square(log(TotalBsmtSF)))' = ifelse(
TotalBsmtSF <= 0,
0,
Winsorize(
x = log(TotalBsmtSF)^2,
probs = c(0.005, 0.995),
na.rm = T
)
)
) %>%
mutate(
'Win(log(TotalBsmtSF))' = ifelse(
TotalBsmtSF <= 0,
0,
Winsorize(
x = log(TotalBsmtSF),
probs = c(0.005, 0.995),
na.rm = T
)
)
)
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotalBsmtSF', 'log(TotalBsmtSF)', 'square(log(TotalBsmtSF))',
'Win(square(log(TotalBsmtSF)))')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## TotalBsmtSF log(TotalBsmtSF) square(log(TotalBsmtSF))
## Min. :0.001736 Min. :0.001862 Min. :0.004075
## 1st Qu.:0.111764 1st Qu.:0.067788 1st Qu.:0.076938
## Median :0.352060 Median :0.155643 Median :0.226709
## Mean :0.306997 Mean :0.178743 Mean :0.233972
## 3rd Qu.:0.404087 3rd Qu.:0.229931 3rd Qu.:0.300477
## Max. :0.823514 Max. :0.999521 Max. :0.965497
## Win(square(log(TotalBsmtSF)))
## Min. :0.003976
## 1st Qu.:0.076883
## Median :0.226443
## Mean :0.233493
## 3rd Qu.:0.299516
## Max. :0.966195
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
## TotalBsmtSF log(TotalBsmtSF) square(log(TotalBsmtSF))
## Min. :0.002517 Min. :0.006178 Min. :0.004722
## 1st Qu.:0.143176 1st Qu.:0.128517 1st Qu.:0.135745
## Median :0.331959 Median :0.323649 Median :0.326473
## Mean :0.315899 Mean :0.310116 Mean :0.313815
## 3rd Qu.:0.407319 3rd Qu.:0.414363 3rd Qu.:0.418140
## Max. :0.903203 Max. :0.994652 Max. :0.990654
## Win(square(log(TotalBsmtSF)))
## Min. :0.005277
## 1st Qu.:0.136215
## Median :0.327097
## Mean :0.314560
## 3rd Qu.:0.420007
## Max. :0.988335
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features') +
xlab(label = 'Subset with no 0s')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
Looks to be some clustering here, two or three groups. Maybe those with and without a second basement, maybe basement types. I think regression will suss out what information is needed, but it’s worth noting.
The raw feature actually correlates much better with the target variable when 0s are included. It might be worth keeping the raw features for the Lasso regression to weed through. Normalizing apart from 0s may or may not help other regressions; it would be ideal for some kind of combo regression that splits/clusters then finds a linear regression, which is sort of what I’m attempting by feeding the binary version of these features to Lasso (where not already encoded by a factor). Normalizing apart for 0s is not likely to help decision trees much, but may help KNN a little by pulling 0s farther away and outliers closer.
x = 'Win(square(log(TotalBsmtSF)))'
min_val = min(Win_log_x_squared)
max_val = max(Win_log_x_squared)
print(paste("min_val:", min_val))
## [1] "min_val: 32.6597927030784"
print(paste("max_val:", max_val))
## [1] "max_val: 60.3486636755867"
val_train_Xy = val_train_Xy %>%
mutate(
'Win(square(log(TotalBsmtSF)))' = ifelse(
TotalBsmtSF == 0,
0,
Winsorize(
log(TotalBsmtSF)^2,
# probs = c(0.005, 0.995),
minval = min_val,
maxval = max_val
)
)
) %>%
select(-c('Win(log(TotalBsmtSF))', 'log(TotalBsmtSF)'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
I can create a binary for basement when I one-hot encode during modeling, but I want to be able to use it for exploration here.
val_train_Xy = val_train_Xy %>%
mutate('Bsmt.bin' = factor(ifelse(TotalBsmtSF == 0, 0, 1), ordered = T))
x = 'Bsmt.bin'
y = 'SalePrice'
summarize_by(data = val_train_Xy, x = x, y = y)
y = 'log(SalePrice)'
sum_and_trans_fact(data = val_train_Xy, x = x, y = y)
## NULL
p_vals = get_signif_levels(data = val_train_Xy, x = x, z = y, min_n = 20)
heatmap.2(
x = as.matrix(p_vals$pval_df),
scale = 'none',
Rowv = F,
Colv = F,
dendrogram = 'none',
cellnote = format(p_vals$pval_df, digits = 2),
notecex = 0.75,
notecol = 'black',
main = paste(y, 'p-values'),
key = F
)
print(
paste(
"Levels w/ significantly different",
y,
"than another level:"
)
)
## [1] "Levels w/ significantly different log(SalePrice) than another level:"
print(p_vals$signif_levels)
## [1] "1" "0"
val_train_Xy = select(val_train_Xy, -c('Bsmt.bin'))
TotalBsmtSF - BsmtUnfSF
x = 'TotalBsmtFinSF'
val_train_Xy = val_train_Xy %>%
mutate('TotalBsmtFinSF' = TotalBsmtSF - BsmtUnfSF)
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## TotalBsmtFinSF
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 456.0
## Mean : 493.6
## 3rd Qu.: 786.0
## Max. :5644.0
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 50,
t_binw = 1
)
## NULL
x = 'sqrt(TotalBsmtFinSF)'
val_train_Xy = val_train_Xy %>%
mutate('sqrt(TotalBsmtFinSF)' = sqrt(TotalBsmtFinSF))
# Recalculate best normalizers.
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
best_normalizers = find_best_normalizer_per_feat(
df = val_train_Xy,
feats_lst = num_feats,
funcs_lst = funcs_lst,
exclude_vals = list(0)
)
summary(val_train_Xy[x])
## sqrt(TotalBsmtFinSF)
## Min. : 0.00
## 1st Qu.: 0.00
## Median :21.35
## Mean :17.39
## 3rd Qu.:28.04
## Max. :75.13
sum_and_trans_cont(
data = val_train_Xy[val_train_Xy[[x]] != 0, ],
x = x,
func = best_normalizers[[x]]$best_func$func,
func_name = best_normalizers[[x]]$best_func$name,
x_binw = 1,
t_binw = 1
)
## NULL
Just the non-zero set.
x = 'sqrt(TotalBsmtFinSF)'
df = val_train_Xy[val_train_Xy$TotalBsmtFinSF != 0, ]
qqnorm(y = df$TotalBsmtFinSF, ylab = 'TotalBsmtFinSF')
qqline(y = df$TotalBsmtFinSF, ylab = 'TotalBsmtFinSF')
qqnorm(y = df[[x]], ylab = x)
qqline(y = df[[x]], ylab = x)
Win_sqrt_x = Winsorize(
x = df[[x]],
probs = c(0.03, 0.995),
na.rm = T
)
qqnorm(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
qqline(y = Win_sqrt_x, ylab = 'Win_sqrt_x')
Win_raw_x = Winsorize(
x = df$TotalBsmtFinSF,
probs = c(0.001, 0.99),
na.rm = T
)
qqnorm(y = Win_raw_x, ylab = 'Win(TotalBsmtFinSF)')
qqline(y = Win_raw_x, ylab = 'Win(TotalBsmtFinSF)')
print(shapiro.test(x = df$TotalBsmtFinSF))
##
## Shapiro-Wilk normality test
##
## data: df$TotalBsmtFinSF
## W = 0.83537, p-value < 2.2e-16
print(shapiro.test(x = df$`sqrt(TotalBsmtFinSF)`))
##
## Shapiro-Wilk normality test
##
## data: df$`sqrt(TotalBsmtFinSF)`
## W = 0.97064, p-value = 3.291e-08
print(shapiro.test(x = Win_sqrt_x))
##
## Shapiro-Wilk normality test
##
## data: Win_sqrt_x
## W = 0.99214, p-value = 0.01256
print(shapiro.test(x = Win_raw_x))
##
## Shapiro-Wilk normality test
##
## data: Win_raw_x
## W = 0.97166, p-value = 5.265e-08
val_train_Xy = val_train_Xy %>%
mutate(
'Win(sqrt(TotalBsmtFinSF))' = ifelse(
TotalBsmtFinSF == 0,
0,
Winsorize(
x = sqrt(TotalBsmtFinSF),
# probs = c(0.01, 0.995),
minval = min(Win_sqrt_x),
maxval = max(Win_sqrt_x)
)
)
) %>%
mutate(
'Win(TotalBsmtFinSF)' = ifelse(
TotalBsmtFinSF == 0,
0,
Winsorize(
x = TotalBsmtFinSF,
# probs = c(0.001, 0.99)
minval = min(Win_raw_x),
maxval = max(Win_raw_x)
)
)
)
x = 'Win(sqrt(TotalBsmtFinSF))'
num_feats = colnames(select(val_train_Xy, where(is.numeric)))
x_lst = c('TotalBsmtFinSF', 'sqrt(TotalBsmtFinSF)', x)
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
!is.na(.data[[x]])
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs:")
## [1] "Summary of absolute values of Pearson's Rs:"
df = abs(df)
summary(abs(df))
## TotalBsmtFinSF sqrt(TotalBsmtFinSF) Win(sqrt(TotalBsmtFinSF))
## Min. :0.0004767 Min. :0.007371 Min. :0.008608
## 1st Qu.:0.0941765 1st Qu.:0.093248 1st Qu.:0.098192
## Median :0.2387013 Median :0.166897 Median :0.155155
## Mean :0.2619773 Mean :0.222351 Mean :0.218137
## 3rd Qu.:0.3211208 3rd Qu.:0.277837 3rd Qu.:0.277832
## Max. :0.9567699 Max. :0.957414 Max. :0.955968
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features')
df = get_cors(
data = filter(
select(val_train_Xy, all_of(num_feats)),
.data[[x]] != 0
),
x_lst = x_lst,
feats = num_feats
)
df
print("Summary of absolute values of Pearson's Rs (no 0s):")
## [1] "Summary of absolute values of Pearson's Rs (no 0s):"
df = abs(df)
summary(abs(df))
## TotalBsmtFinSF sqrt(TotalBsmtFinSF) Win(sqrt(TotalBsmtFinSF))
## Min. :0.002897 Min. :0.0002521 Min. :0.0187
## 1st Qu.:0.112971 1st Qu.:0.1000694 1st Qu.:0.1153
## Median :0.274222 Median :0.2348650 Median :0.2413
## Mean :0.295891 Mean :0.2677646 Mean :0.2713
## 3rd Qu.:0.388785 3rd Qu.:0.3526043 3rd Qu.:0.3546
## Max. :0.917361 Max. :0.9644098 Max. :0.9887
df = melt(df)
ggplot(df, aes(x = variable, y = value)) +
geom_boxplot(notch = T) +
ylab(label = 'Absolute Value of Correlation to Other Features (no 0s)')
y_lst = c('log(SalePrice)')
for (feat in x_lst) {
plot_scat_pairs(df = val_train_Xy, x = feat, y_lst = y_lst)
}
x = 'Win(sqrt(TotalBsmtFinSF))'
min_val = min(Win_sqrt_x)
max_val = max(Win_sqrt_x)
print(paste("min_val:", min_val))
## [1] "min_val: 10.1234508028247"
print(paste("max_val:", max_val))
## [1] "max_val: 46.3884140769296"
# Already hard coded above.
val_train_Xy = select(val_train_Xy, -c('Win(TotalBsmtFinSF)'))
gg = ggplot(val_train_Xy, aes(x = .data[[x]]))
p1 = gg + geom_histogram(binwidth = 1)
p2 = gg + geom_boxplot(notch = T)
grid.arrange(p1, p2)
# Continue in EDA_pt2
val_train_Xy$Id = val_train_X$Id
saveRDS(val_train_Xy, 'data/eda_pt1_val_train_Xy.rds')
head(readRDS('data/eda_pt1_val_train_Xy.rds'))